In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import geopandas as gpd
import json
from shapely.geometry import Point
from IPython.display import Image, display
import seaborn as sns
from pysal.lib import weights
from libpysal.weights import w_subset
from pysal.explore import esda
from numpy.random import seed
from pysal.viz import splot
from splot.esda import plot_moran
from splot import esda as esdaplot
import contextily
from sklearn.neighbors import NearestNeighbors
In [2]:
# Import preprocessed data
# Import preprocessed data
folder_path = "data/final_processed_data/"
neighborhood_stats = gpd.read_file(folder_path + "neighborhood_stats.gpkg")
airbnb_gdf = gpd.read_file(folder_path + "airbnb_listings.gpkg")
airbnb_prices = gpd.read_file(folder_path + "airbnb_prices.gpkg")
airbnb_prices_entirespace = gpd.read_file(folder_path + "airbnb_prices_entirespace.gpkg")
airbnb_prices_entirespace_highseason = gpd.read_file(folder_path + "airbnb_prices_entirespace_highseason.gpkg")
airbnb_prices_entirespace_lowseason = gpd.read_file(folder_path + "airbnb_prices_entirespace_lowseason.gpkg")
poi_gdf = gpd.read_file(folder_path + "tourism_pois.gpkg")
museums_gdf = gpd.read_file(folder_path + "museum_pois.gpkg")
galleries_gdf = gpd.read_file(folder_path + "gallery_pois.gpkg")
monuments_gdf = gpd.read_file(folder_path + "monument_pois.gpkg")
buurten_gdf = gpd.read_file(folder_path + "neighborhood_polygons.gpkg")

# Verify every layer is already in the Dutch national CRS (EPSG:28992, Amersfoort / RD New),
# since all distance-based steps below (KNN weights, centroid distances) assume metric coordinates.
_all_layers = [
    neighborhood_stats, airbnb_gdf, airbnb_prices, airbnb_prices_entirespace,
    airbnb_prices_entirespace_highseason, airbnb_prices_entirespace_lowseason,
    poi_gdf, museums_gdf, galleries_gdf, monuments_gdf, buurten_gdf,
]
if all(layer.crs == "EPSG:28992" for layer in _all_layers):
    print("All dataset CRS are set to EPSG:28992")
else:
    print("Convert all datasets to EPSG:28992")
All dataset CRS are set to EPSG:28992

Determining k-value for K-Nearest Neighbors¶

When considering a suitable k-value range for our autocorrelation analysis, we can find an optimal range where each increment in k yields the smallest change in median and maximum distance for each neighborhood polygon in the city of Amsterdam. The k values with the smallest changes between increments are from k=3 to k=12, so we can use this range to test for the most optimal Moran's I value when computing the global autocorrelation statistic, as defined in the function we can deploy for each attribute we test in the analysis.

In [3]:
# Determine best k value for KNN: inspect how the distance to the k-th nearest
# neighborhood centroid grows as k increases.
centroids = neighborhood_stats.geometry.centroid
coords = np.column_stack([centroids.x.to_numpy(), centroids.y.to_numpy()])
for k in range(1, 16):
    # Query k+1 neighbors because each point is its own nearest neighbor.
    dists, _ = NearestNeighbors(n_neighbors=k + 1).fit(coords).kneighbors(coords)
    kth_dist = dists[:, -1]  # distance to the k-th true neighbor
    print(f"k={k}: median dist={np.median(kth_dist):.0f}, max dist={np.max(kth_dist):.0f}")

#k_value = 5

def set_k_value(df, attr, lowest_k, highest_k):
    """Print global Moran's I for ``attr`` across a range of KNN k values.

    For each k in [lowest_k, highest_k], builds a KNN weights matrix from the
    GeoDataFrame, restricts it to observations with a non-missing ``attr``,
    row-standardizes the subset, and reports Moran's I with its pseudo
    p-value — used to pick a suitable k for the weights matrix.

    Args:
        df: GeoDataFrame with a geometry column and the ``attr`` column.
        attr: name of the attribute column to test.
        lowest_k, highest_k: inclusive range of k values to evaluate.
    """
    for k in range(lowest_k, highest_k + 1):
        w_k = weights.KNN.from_dataframe(df, k=k)
        # Note: row-standardizing w_k here would be wasted work — w_subset
        # discards the transform, so we standardize only the subset below.
        valid_ids = df.index[df[attr].notna()].tolist()
        w_sub = w_subset(w_k, valid_ids)
        w_sub.transform = "R"
        # Values must follow the weights' internal ordering.
        y = df.loc[w_sub.id_order, attr].to_numpy()
        moran = esda.Moran(y, w_sub)
        print(f"k={k:2d} -> Moran's I = {moran.I:.3f}, p = {moran.p_sim:.3f}")

#print("")
#print("Best k-value for KNN based on dataset polygons: " + str(k_value))
k=1: median dist=341, max dist=2625
k=2: median dist=422, max dist=3343
k=3: median dist=500, max dist=3896
k=4: median dist=555, max dist=4153
k=5: median dist=628, max dist=4405
k=6: median dist=679, max dist=4548
k=7: median dist=715, max dist=4681
k=8: median dist=768, max dist=5040
k=9: median dist=803, max dist=5060
k=10: median dist=854, max dist=5111
k=11: median dist=885, max dist=5325
k=12: median dist=931, max dist=5353
k=13: median dist=968, max dist=5505
k=14: median dist=1018, max dist=5542
k=15: median dist=1058, max dist=5578

Defining functions to perform autocorrelation analysis¶

Since there are multiple angles to investigate regarding the Airbnb and tourism spatial datasets, it will be useful to define functions to do so and repeat them later by changing the input parameters. This way it will be easier to compare all Airbnbs vs. only Airbnb listings offering the entire space, to compare high season vs. low season, to compare specific tourism POIs like museums, etc.

In [4]:
def prep_data(df, k_value):
    """Copy the GeoDataFrame and build a row-standardized KNN weights matrix.

    Returning a copy keeps the caller's frame untouched by later column writes.

    Args:
        df: input GeoDataFrame.
        k_value: number of nearest neighbors for the weights matrix.

    Returns:
        (frame, knn_w): the copied frame and its row-standardized KNN weights.
    """
    frame = df.copy()
    knn_w = weights.KNN.from_dataframe(frame, k=k_value)
    knn_w.transform = "R"  # row-standardize: each row of W sums to 1
    return frame, knn_w

def spatial_lag(df, att_col, att_lag_col, weight_matrix, k_value):
    """Attach the spatial lag of ``att_col`` to ``df`` as column ``att_lag_col``.

    With row-standardized weights the lag is the average of each observation's
    neighbors' values. ``k_value`` is not read here (the neighbor count is
    already baked into ``weight_matrix``) but is kept for call compatibility.
    Mutates ``df`` in place and returns it.
    """
    # weights.lag_spatial is the same callable as weights.spatial_lag.lag_spatial;
    # use the short alias for consistency with the other helpers in this notebook.
    df[att_lag_col] = weights.lag_spatial(weight_matrix, df[att_col])
    return df
    
def plot_spatial_lag(df, att_col, att_lag_col, k_value, title, figsizew=12, figsizeh=6, colormap="plasma", scheme="quantiles", edgecolor="grey", linewidth=0.25, transparency=1, legendon=True):
    """Choropleth the raw attribute next to its spatial lag with shared styling.

    Left panel maps ``att_col``; right panel maps ``att_lag_col`` with the same
    classification scheme (``scheme`` with ``k_value`` classes) and colormap.
    """
    fig, axes = plt.subplots(1, 2, figsize=(figsizew, figsizeh))
    panels = [(att_col, str(title)), (att_lag_col, str(title) + " - Spatial Lag")]
    for ax, (column, panel_title) in zip(axes, panels):
        df.plot(
            column=column,
            cmap=colormap,
            scheme=scheme,
            k=k_value,
            edgecolor=edgecolor,
            linewidth=linewidth,
            alpha=transparency,
            legend=legendon,
            ax=ax,
        )
        ax.set_axis_off()
        ax.set_title(panel_title)
    plt.show()

def standardize_values(df, w, att_col, att_lag_col, att_col_std, att_lag_col_std):
    """Add a mean-centered copy of ``att_col`` and its spatial lag to ``df``.

    These two columns are the axes of the Moran scatterplot. ``att_lag_col``
    is accepted for call compatibility but not read here. Mutates ``df`` in
    place and returns it.
    """
    centered = df[att_col] - df[att_col].mean()  # deviation from the global mean
    df[att_col_std] = centered
    df[att_lag_col_std] = weights.lag_spatial(w, centered)
    return df

def plot_standardized_values(df, att_col_std, att_lag_col_std, title, figsizew=3, figsizeh=3, confidence=25, marker=".", pointcolor="blue", linecolor="red", label_frac=0.75, label_fontsize=10, label_weight="bold", label_color="red"):
    """Moran scatterplot: standardized attribute vs. its spatial lag.

    Draws a regression line through the scatter and labels the four quadrants
    (HH, HL, LL, LH) at ``label_frac`` of the way toward each corner.

    Fix: the regplot is now drawn on the created axes explicitly (``ax=ax``)
    instead of relying on matplotlib's implicit current-axes state.
    """
    f, ax = plt.subplots(1, figsize=(figsizew, figsizeh))
    sns.regplot(x=att_col_std, y=att_lag_col_std, ci=confidence, data=df, marker=marker, scatter_kws={"color": pointcolor}, line_kws={"color": linecolor}, ax=ax)
    # Reference lines at the means (data are mean-centered, so zero).
    ax.axvline(0, c="k", alpha=0.5)
    ax.axhline(0, c="k", alpha=0.5)
    ax.set_title(str(title) + " - Standardized Values")
    x_min, x_max = ax.get_xlim()
    y_min, y_max = ax.get_ylim()
    # Quadrant labels, placed label_frac of the way out toward each corner.
    corners = [
        (x_max, y_max, "HH"),
        (x_max, y_min, "HL"),
        (x_min, y_min, "LL"),
        (x_min, y_max, "LH"),
    ]
    for x_pos, y_pos, label in corners:
        ax.text(label_frac * x_pos, label_frac * y_pos, label, ha="center", va="center", fontsize=label_fontsize, fontweight=label_weight, color=label_color)
    plt.show()

def handle_missing_values(df, w, att_col, att_col_std, att_lag_col_std, w_col, std_w_col):
    """Restrict the data and weights to observations with a non-missing ``att_col``.

    Args:
        df: GeoDataFrame containing ``att_col``.
        w: full KNN weights built on all observations.
        att_col: attribute column to analyze.
        att_col_std, w_col, std_w_col: names of the derived columns to create.
        att_lag_col_std: accepted for call compatibility; not read here.

    Returns:
        w_sub: weights subset to non-missing observations, row-standardized.
        y: attribute values ordered like ``w_sub.id_order``.
        subset: copy of ``df`` restricted and ordered the same way, with the
            spatial lag, mean-centered values, and lag of centered values added.
    """
    keep_ids = df.index[df[att_col].notna()].tolist()
    w_sub = w_subset(w, keep_ids)
    # Re-standardize after subsetting: dropping neighbors changes row sums.
    w_sub.transform = "R"

    # Align values and frame to the weights' internal ordering.
    ordered_ids = w_sub.id_order
    y = df.loc[ordered_ids, att_col].to_numpy()
    subset = df.loc[ordered_ids].copy()

    # Spatial lag, mean-centered attribute, and lag of the centered attribute,
    # computed only over the non-missing polygons.
    subset[w_col] = weights.lag_spatial(w_sub, subset[att_col])
    subset[att_col_std] = subset[att_col] - subset[att_col].mean()
    subset[std_w_col] = weights.lag_spatial(w_sub, subset[att_col_std])

    return w_sub, y, subset

def run_global_MoranI(df, w, att=None):
    """Compute and print global Moran's I for an attribute or a value array.

    Args:
        df: GeoDataFrame (when ``att`` names a column) or a 1-D array of values
            (when ``att`` is None).
        w: spatial weights aligned with the values.
        att: optional column name to pull the values from ``df``.

    Returns:
        The fitted esda Moran object (``.I`` statistic, ``.p_sim`` pseudo p-value).
    """
    values = df if att is None else df[att]
    moran = esda.moran.Moran(values, w)
    print("Moran's I:", moran.I)
    print("p-value:", moran.p_sim)
    return moran

def plot_global_MoranI(moran):
    # Thin wrapper: splot renders the reference distribution and Moran
    # scatterplot for a fitted global Moran object.
    plot_moran(moran)

def run_LISA(df, w, att=None, mask=None, siglevel=0.05):
    """Run Local Moran's I (LISA) and attach the results to a GeoDataFrame.

    Args:
        df: GeoDataFrame (when ``att`` names a column) or a 1-D array of values
            (when ``att`` is None).
        w: spatial weights aligned with the values.
        att: column of ``df`` to analyze; None means ``df`` itself is the values.
        mask: GeoDataFrame that receives the result columns when ``att`` is
            None (required in that case — an array cannot hold them).
        siglevel: pseudo p-value threshold for flagging significant locations.

    Returns:
        (lisa, df_output): the fitted Moran_Local object and a copy of the
        frame with I, Q, Quad, p_value, significance_level and cluster columns.

    Raises:
        ValueError: if ``att`` is None and ``mask`` is None. (Previously this
            path crashed later with a NameError on ``df_output``.)
    """
    if att is None and mask is None:
        raise ValueError("mask is required when att is None: pass the GeoDataFrame that should receive the LISA columns")
    if att is None:
        lisa = esda.moran.Moran_Local(df, w)
        df_output = mask.copy()
    else:
        lisa = esda.moran.Moran_Local(df[att], w)
        df_output = df.copy()
    df_output["I"] = lisa.Is
    df_output["Q"] = lisa.q
    # esda quadrant codes: 1=HH, 2=LH, 3=LL, 4=HL.
    quad_map = {1: "High-High", 2: "Low-High", 3: "Low-Low", 4: "High-Low"}
    df_output["Quad"] = pd.Series(lisa.q, index=df_output.index).map(quad_map)
    df_output["p_value"] = lisa.p_sim
    df_output["significance_level"] = df_output["p_value"] < siglevel
    # Keep the quadrant label only where significant; everything else collapses
    # into a single "Not significant" class for mapping.
    df_output["cluster"] = np.where(df_output["significance_level"], df_output["Quad"], "Not significant")
    return lisa, df_output

def plot_LISA(lisa, df, w, mask=None, siglevel=0.05, figsizew=10, figsizeh=5, sigcolor="green"):
    """Summarize a LISA run: density/rug of the local I values and quadrant counts.

    ``w`` is accepted for call symmetry but not read here; ``df`` is only used
    when ``mask`` is None.
    """
    frame = df if mask is None else mask
    labels = pd.Series(lisa.q, index=frame.index).map({1: "HighHigh", 2: "LowHigh", 3: "LowLow", 4: "HighLow"})
    order = ["HighHigh", "LowLow", "HighLow", "LowHigh"]
    sig_string = "Statistically Significant (α = " + str(siglevel) + ")"
    # Quadrant counts for all observations vs. only the significant ones.
    counts = pd.DataFrame(
        {
            "All Observations": labels.value_counts().reindex(order),
            sig_string: labels[lisa.p_sim < siglevel].value_counts().reindex(order),
        }
    ).fillna(0)
    fig, (ax_density, ax_counts) = plt.subplots(1, 2, figsize=(figsizew, figsizeh))
    # Left: distribution of local statistics with a rug of observations.
    sns.kdeplot(lisa.Is, ax=ax_density, label="Density", color="grey")
    sns.rugplot(lisa.Is, ax=ax_density, label="LISA Observations (Std Dev)", color="red", height=0.1)
    ax_density.set_title("Local Moran's I")
    ax_density.legend()
    # Right: grouped bar chart of quadrant counts.
    counts.plot(kind="bar", color=["grey", sigcolor], ax=ax_counts)
    ax_counts.set_title(f"LISA quadrant counts (α = {siglevel})")
    ax_counts.set_xlabel("Observation Value & Neighbor Likeness")
    ax_counts.set_ylabel("Count")
    plt.tight_layout()
    plt.show()

def map_LISA(lisa,df,w,mask=None,siglevel=0.05,localstats=True,quad_categories=True,significance=True,clustermap=True,figsze=6, 
             stats_cmap="cividis", stats_scheme="quantiles", stats_edgecolor="grey", stats_linewidth=0.25, stats_transparency=1, stats_legend=True,
            quad_c1="#306844", quad_c2="#98FBCB", quad_c3="#ff8503", quad_c4="#FFEE8C", quad_edgecolor="grey", quad_linewidth=0.25, quad_legend="upper right",
            sig_cmap="BrBG",sig_linewidth=0.25,sig_edgecolor="white",sig_transparency=0.55,sig_legend=True,
            clust_c1="#306844", clust_c2="#98FBCB", clust_c3="#ff8503", clust_c4="#FFEE8C", clust_c5="#d9d9d9", clust_edgecolor="grey", clust_linewidth=0.25, clust_legend="upper right"):
    """Draw up to four LISA maps: local statistics, quadrants, significance, clusters.

    Args:
        lisa: fitted Moran_Local result (provides .Is, .q, .p_sim).
        df: GeoDataFrame to plot when ``mask`` is None.
        w: spatial weights — accepted for call symmetry; not read in this function.
        mask: alternative GeoDataFrame to plot; must carry the "Quad" and
            "cluster" columns produced by run_LISA for those panels to render.
        siglevel: pseudo p-value cutoff for the significance/cluster panels.
        localstats / quad_categories / significance / clustermap: panel toggles.
        figsze: edge length (inches) of each square panel.
        stats_* / quad_* / sig_* / clust_*: per-panel styling options.

    NOTE(review): ``sig_legend`` is not read — the significance panel
    hard-codes legend=True; confirm whether that is intended.
    """
    gdf_to_plot = mask if mask is not None else df

    # Collect the requested panels in a fixed order.
    panels = []
    if localstats:
        panels.append("localstats")
    if quad_categories:
        panels.append("quadrant")
    if significance:
        panels.append("significance")
    if clustermap:
        panels.append("cluster")

    # Grid shape: 1x1, 1x2 or 2x2 depending on the number of panels.
    n = len(panels)
    if n == 1:
        nrows, ncols = 1, 1
    elif n == 2:
        nrows, ncols = 1, 2
    else:
        nrows, ncols = 2, 2

    figsize = (figsze * ncols, figsze * nrows)
    f, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)
    # Flatten so single- and multi-axes cases iterate the same way.
    axs = np.atleast_1d(axs).ravel()

    quad_colors = {"High-High": quad_c1, "High-Low": quad_c2, "Low-Low": quad_c3, "Low-High": quad_c4}
    cluster_colors = {"Not significant": clust_c5, "High-High": clust_c1, "High-Low": clust_c2, "Low-Low": clust_c3, "Low-High": clust_c4}

    for ax, panel in zip(axs, panels):
        if panel == "localstats":
            # Choropleth of the raw local Moran statistics (5 classes hard-coded).
            gdf_to_plot.assign(Is=lisa.Is).plot(column="Is",cmap=stats_cmap,scheme=stats_scheme,k=5,edgecolor=stats_edgecolor,linewidth=stats_linewidth,alpha=stats_transparency,legend=stats_legend,ax=ax)
            ax.set_title("Local Statistics", y=1)

        elif panel == "quadrant":
            # Plot each quadrant category separately so its color is explicit,
            # building matching legend patches as we go.
            legend_handles = []
            for label, color in quad_colors.items():
                sel = gdf_to_plot["Quad"] == label
                if sel.any():
                    gdf_to_plot.loc[sel].plot(ax=ax,color=color,edgecolor=quad_edgecolor,linewidth=quad_linewidth)
                    # Make an explicit legend patch for this label
                    legend_handles.append(mpatches.Patch(color=color, label=label))
            ax.set_title("LISA Quadrants", y=1)
            if legend_handles:
                ax.legend(handles=legend_handles,title="Quadrant",loc=quad_legend,fontsize=6,)

        elif panel == "significance":
            # Binary significant / non-significant map at the chosen siglevel.
            labels = (pd.Series(1 * (lisa.p_sim < siglevel), index=gdf_to_plot.index).map({1: "Significant", 0: "Non-Significant"}))
            gdf_to_plot.assign(cl=labels).plot(column="cl",categorical=True,k=2,cmap=sig_cmap,linewidth=sig_linewidth,edgecolor=sig_edgecolor,alpha=sig_transparency,legend=True,ax=ax)
            ax.set_title(f"Statistical Significance (α = {siglevel})", y=1)

        elif panel == "cluster":
            # Combined map: significant quadrants colored, the rest greyed out
            # via the "Not significant" class (clust_c5).
            legend_handles = []
            for label, color in cluster_colors.items():
                sel = gdf_to_plot["cluster"] == label
                if sel.any():
                    gdf_to_plot.loc[sel].plot(ax=ax,color=color,edgecolor=clust_edgecolor,linewidth=clust_linewidth)
                    legend_handles.append(mpatches.Patch(color=color, label=label))
            ax.set_title(f"Moran Cluster Map (α = {siglevel})", y=1)
            if legend_handles:
                ax.legend(handles=legend_handles,title="Cluster",loc=clust_legend,fontsize=6,
                )


        ax.set_axis_off()

    f.tight_layout()
    plt.show()

Airbnb Autocorrelation Analysis¶

Average Airbnb Prices per neighborhood¶

Describe the attribute we are analyzing

We used a K-nearest neighbors spatial weights matrix with k = 5. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 5 and declining gradually for larger k.

In [5]:
# Analysis Parameters
# Attribute: average Airbnb price per neighborhood (all listing types).
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "avg_price_airbnb", "avg_price_lag"
# Derived column names for standardized values and spatial lags.
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))

# k chosen from the sensitivity sweep printed by set_k_value below.
k_value = 5
set_k_value(gdf_copy, att_col, 3, 12)

# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.326, p = 0.001
k= 4 -> Moran's I = 0.329, p = 0.001
k= 5 -> Moran's I = 0.333, p = 0.001
k= 6 -> Moran's I = 0.326, p = 0.001
k= 7 -> Moran's I = 0.319, p = 0.001
k= 8 -> Moran's I = 0.317, p = 0.001
k= 9 -> Moran's I = 0.308, p = 0.001
k=10 -> Moran's I = 0.307, p = 0.001
k=11 -> Moran's I = 0.298, p = 0.001
k=12 -> Moran's I = 0.293, p = 0.001

Avg Airbnb Prices - Spatial Lag¶

Describe spatial lag

In [6]:
# Spatial Lag
# Compute the neighbor-averaged price and map raw vs. lagged values side by side.
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Avg Airbnb Prices", colormap="YlGn")
No description has been provided for this image

Observe what we see in the plot

Avg Airbnb Prices - Standardized Data¶

Explain why we standardize (normalize) and how

In [7]:
# Standardize
# Mean-center the attribute and draw the Moran scatterplot.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Avg Airbnb Prices", pointcolor="green")
No description has been provided for this image

Observe what we see in the plot

Avg Airbnb Prices - Global Moran's I¶

Describe what global autocorrelation is and what Moran's I is computing

In [8]:
# Global Moran's I
# Drop neighborhoods with missing prices from data and weights before testing.
w_sub, y, gdf_masked = handle_missing_values(gdf, weight_matrix, att_col, att_col_std, att_lag_col_std, w_col, std_w_col)
moran = run_global_MoranI(y, w_sub)
plot_global_MoranI(moran)
Moran's I: 0.3326235709944917
p-value: 0.001
No description has been provided for this image

Observe what we see in the plot

Avg Airbnb Prices - Local Indicators of Spatial Association (LISA)¶

In [9]:
# LISA
# Local Moran's I on the masked values; keep a named copy of the cluster frame.
lisa, cluster_df = run_LISA(y, w_sub, att=None, mask=gdf_masked)
airbnb_prices_clusterdf = cluster_df.copy()
plot_LISA(lisa, y, w_sub, mask=gdf_masked)
No description has been provided for this image

Observe what we see in the plot

In [10]:
map_LISA(lisa, y, w_sub, mask=cluster_df)
No description has been provided for this image

Observe what we see in the plot


Airbnb prices for private listings¶

Describe the attribute we are analyzing

We used a K-nearest neighbors spatial weights matrix with k = 4. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 4 and declining gradually for larger k.

In [11]:
# Analysis Parameters
# Attribute: average price of entire-space (private) Airbnb listings.
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "avg_price_airbnb_entirespace", "avg_price_airbnb_entirespace_lag"
# Derived column names for standardized values and spatial lags.
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))

# k chosen from the sensitivity sweep printed by set_k_value below.
k_value = 4
set_k_value(gdf_copy, att_col, 3, 12)

# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
('WARNING: ', 294, ' is an island (no neighbors)')
('WARNING: ', 353, ' is an island (no neighbors)')
k= 3 -> Moran's I = 0.347, p = 0.001
k= 4 -> Moran's I = 0.355, p = 0.001
k= 5 -> Moran's I = 0.333, p = 0.001
k= 6 -> Moran's I = 0.312, p = 0.001
k= 7 -> Moran's I = 0.304, p = 0.001
k= 8 -> Moran's I = 0.302, p = 0.001
k= 9 -> Moran's I = 0.297, p = 0.001
k=10 -> Moran's I = 0.287, p = 0.001
k=11 -> Moran's I = 0.282, p = 0.001
k=12 -> Moran's I = 0.274, p = 0.001

Avg Private Airbnb Prices - Spatial Lag¶

Describe spatial lag

In [12]:
# Spatial Lag
# Neighbor-averaged entire-space prices, mapped next to the raw values.
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Avg Private Airbnb Prices", colormap="YlGn")
No description has been provided for this image

Observe what we see in the plot

Avg Private Airbnb Prices - Standardized Data¶

Explain why we standardize (normalize) and how

In [13]:
# Standardize
# Mean-center the attribute and draw the Moran scatterplot.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Avg Private Airbnb Prices", pointcolor="green")
No description has been provided for this image

Observe what we see in the plot

Avg Private Airbnb Prices - Global Moran's I¶

Describe what global autocorrelation is and what Moran's I is computing

In [14]:
# Global Moran's I
# Mask missing prices before computing the global statistic.
w_sub, y, gdf_masked = handle_missing_values(gdf, weight_matrix, att_col, att_col_std, att_lag_col_std, w_col, std_w_col)
moran = run_global_MoranI(y, w_sub)
plot_global_MoranI(moran)
Moran's I: 0.35490103888586577
p-value: 0.001
No description has been provided for this image

Observe what we see in the plot

Avg Private Airbnb Prices - Local Indicators of Spatial Association (LISA)¶

In [15]:
# LISA
# Local Moran's I on entire-space prices; keep a named copy of the clusters.
lisa, cluster_df = run_LISA(y, w_sub, att=None, mask=gdf_masked)
airbnb_prices_entirespace_clusterdf = cluster_df.copy()
plot_LISA(lisa, y, w_sub, mask=gdf_masked)
No description has been provided for this image

Observe what we see in the plot

In [16]:
map_LISA(lisa, y, w_sub, mask=cluster_df)
No description has been provided for this image

Observe what we see in the plot


Airbnb prices during high-season¶

Describe the attribute we are analyzing

We used a K-nearest neighbors spatial weights matrix with k = 5. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns; although global Moran’s I is slightly higher at k = 4, that value still produces an island, and the statistic declines only gradually for k larger than 5.

In [17]:
# Analysis Parameters
# Attribute: average entire-space Airbnb price during the high season.
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "avg_price_airbnb_entirespace_highseason", "avg_price_airbnb_entirespace_highseason_lag"
# Derived column names for standardized values and spatial lags.
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))

# k chosen from the sensitivity sweep printed by set_k_value below.
k_value = 5
set_k_value(gdf_copy, att_col, 3, 12)

# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
('WARNING: ', 232, ' is an island (no neighbors)')
('WARNING: ', 236, ' is an island (no neighbors)')
('WARNING: ', 294, ' is an island (no neighbors)')
('WARNING: ', 422, ' is an island (no neighbors)')
k= 3 -> Moran's I = 0.298, p = 0.001
('WARNING: ', 422, ' is an island (no neighbors)')
k= 4 -> Moran's I = 0.309, p = 0.001
k= 5 -> Moran's I = 0.285, p = 0.001
k= 6 -> Moran's I = 0.285, p = 0.001
k= 7 -> Moran's I = 0.294, p = 0.001
k= 8 -> Moran's I = 0.289, p = 0.001
k= 9 -> Moran's I = 0.278, p = 0.001
k=10 -> Moran's I = 0.265, p = 0.001
k=11 -> Moran's I = 0.267, p = 0.001
k=12 -> Moran's I = 0.264, p = 0.001

Avg Airbnb Prices during High-season - Spatial Lag¶

Describe spatial lag

In [18]:
# Spatial Lag
# Neighbor-averaged high-season prices, mapped next to the raw values.
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, " Avg Airbnb Prices during High-season", colormap="YlGn")
No description has been provided for this image

Observe what we see in the plot

Avg Airbnb Prices during High-season - Standardized Data¶

Explain why we standardize (normalize) and how

In [19]:
# Standardize
# Mean-center the attribute and draw the Moran scatterplot.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, " Avg Airbnb Prices during High-season", pointcolor="green")
No description has been provided for this image

Observe what we see in the plot

Avg Airbnb Prices during High-season - Global Moran's I¶

Describe what global autocorrelation is and what Moran's I is computing

In [20]:
# Global Moran's I
# Mask missing prices before computing the global statistic.
w_sub, y, gdf_masked = handle_missing_values(gdf, weight_matrix, att_col, att_col_std, att_lag_col_std, w_col, std_w_col)
moran = run_global_MoranI(y, w_sub)
plot_global_MoranI(moran)
Moran's I: 0.28498650491364647
p-value: 0.001
No description has been provided for this image

Observe what we see in the plot

Avg Airbnb Prices during High-season - Local Indicators of Spatial Association (LISA)¶

In [21]:
# LISA
# Local Moran's I on high-season prices; keep a named copy of the clusters.
lisa, cluster_df = run_LISA(y, w_sub, att=None, mask=gdf_masked)
airbnb_prices_entirespace_highseason_clusterdf = cluster_df.copy()
plot_LISA(lisa, y, w_sub, mask=gdf_masked)
No description has been provided for this image

Observe what we see in the plot

In [22]:
map_LISA(lisa, y, w_sub, mask=cluster_df)
No description has been provided for this image

Observe what we see in the plot


Airbnb Prices during Low-season¶

Describe the attribute we are analyzing

We used a K-nearest neighbors spatial weights matrix with k = 17. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–25 produced similar clustering patterns, with global Moran’s I peaking at k = 17 and declining gradually for larger k.

In [23]:
# Analysis Parameters
# Attribute: average entire-space Airbnb price during the low season.
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "avg_price_airbnb_entirespace_lowseason", "avg_price_airbnb_entirespace_lowseason_lag"
# Derived column names for standardized values and spatial lags.
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))

# A larger k is needed here to avoid islands; sweep a higher range (15-20).
k_value = 17
set_k_value(gdf_copy, att_col, 15, 20)

# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
('WARNING: ', 293, ' is an island (no neighbors)')
k=15 -> Moran's I = 0.164, p = 0.001
('WARNING: ', 293, ' is an island (no neighbors)')
k=16 -> Moran's I = 0.168, p = 0.001
k=17 -> Moran's I = 0.176, p = 0.001
k=18 -> Moran's I = 0.172, p = 0.001
k=19 -> Moran's I = 0.169, p = 0.001
k=20 -> Moran's I = 0.159, p = 0.001

Avg Airbnb Prices during Low-season - Spatial Lag¶

Describe spatial lag

In [24]:
# Spatial Lag
# Neighbor-averaged low-season prices, mapped next to the raw values.
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Avg Airbnb Prices during Low Season", colormap="YlGn")
No description has been provided for this image

Observe what we see in the plot

Avg Airbnb Prices during Low-season - Standardized Data¶

Explain why we standardize (normalize) and how

In [25]:
# Standardize
# Mean-center the attribute and draw the Moran scatterplot.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Avg Airbnb Prices during Low Season", pointcolor="green")
No description has been provided for this image

Observe what we see in the plot

Airbnb Prices during Low-season - Global Moran's I¶

Describe what global autocorrelation is and what Moran's I is computing

In [26]:
# Global Moran's I
# Mask missing prices before computing the global statistic.
w_sub, y, gdf_masked = handle_missing_values(gdf, weight_matrix, att_col, att_col_std, att_lag_col_std, w_col, std_w_col)
moran = run_global_MoranI(y, w_sub)
plot_global_MoranI(moran)
Moran's I: 0.17589438808237565
p-value: 0.001
No description has been provided for this image

Observe what we see in the plot

Avg Airbnb Prices during Low-season - Local Indicators of Spatial Association (LISA)¶

In [27]:
# LISA
# Local Moran's I on low-season prices; keep a named copy of the clusters.
lisa, cluster_df = run_LISA(y, w_sub, att=None, mask=gdf_masked)
airbnb_prices_entirespace_lowseason_clusterdf = cluster_df.copy()
plot_LISA(lisa, y, w_sub, mask=gdf_masked)
No description has been provided for this image

Observe what we see in the plot

In [28]:
map_LISA(lisa, y, w_sub, mask=cluster_df)
No description has been provided for this image

Observe what we see in the plot


Airbnb Clusters¶

Describe the attribute we are analyzing

In [29]:
# Analysis Parameters
# Attribute: number of Airbnb listings per neighborhood (no missing values,
# so the full weights matrix can be used directly below).
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "count_airbnbs", "count_airbnbs_lag"
# Derived column names for standardized values and spatial lags.
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))

# k chosen from the sensitivity sweep printed by set_k_value below.
k_value = 4
set_k_value(gdf_copy, att_col, 3, 12)

# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.419, p = 0.001
k= 4 -> Moran's I = 0.422, p = 0.001
k= 5 -> Moran's I = 0.413, p = 0.001
k= 6 -> Moran's I = 0.408, p = 0.001
k= 7 -> Moran's I = 0.405, p = 0.001
k= 8 -> Moran's I = 0.395, p = 0.001
k= 9 -> Moran's I = 0.387, p = 0.001
k=10 -> Moran's I = 0.381, p = 0.001
k=11 -> Moran's I = 0.375, p = 0.001
k=12 -> Moran's I = 0.367, p = 0.001

We used a K-nearest neighbors spatial weights matrix with k = 4. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 4 and declining gradually for larger k.

Airbnb Count per neighborhood - Spatial Lag¶

Describe spatial lag

In [30]:
# Spatial Lag
# Neighbor-averaged Airbnb counts, mapped next to the raw counts.
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Number of Airbnbs per neighborhood")
No description has been provided for this image

Observe what we see in the plot

Airbnb Count per neighborhood - Standardized Data¶

Explain why we standardize (normalize) and how

In [31]:
# Standardize
# Mean-center the attribute and draw the Moran scatterplot.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Number of Airbnbs per neighborhood")
No description has been provided for this image

Observe what we see in the plot

Airbnb Count per neighborhood - Global Moran's I¶

Describe what global autocorrelation is and what Moran's I is computing

In [32]:
# Global Moran's I
# Counts have no missing values, so no masking step is needed here.
moran = run_global_MoranI(gdf, weight_matrix, att="count_airbnbs")
plot_global_MoranI(moran)
Moran's I: 0.42245557023524244
p-value: 0.001
No description has been provided for this image

Observe what we see in the plot

Airbnb Count per neighborhood - Local Indicators of Spatial Association (LISA)¶

In [33]:
# LISA
# Local Moran's I directly on the GeoDataFrame column (no mask needed).
lisa, cluster_df = run_LISA(gdf, weight_matrix, att="count_airbnbs")
count_airbnbs_clusterdf = cluster_df.copy()
plot_LISA(lisa, cluster_df, weight_matrix, sigcolor="pink")
No description has been provided for this image

Observe what we see in the plot

In [34]:
map_LISA(lisa, cluster_df, weight_matrix, stats_cmap="PuRd", sig_cmap="tab10", sig_transparency=0.6, quad_c1="#B22222", quad_c2="#F8C8DC", quad_c3="#3D426B", quad_c4="#7894B0", clust_c1="#B22222", clust_c2="#F8C8DC", clust_c3="#3D426B", clust_c4="#7894B0")
No description has been provided for this image

Observe what we see in the plot


Tourism Autocorrelation Analysis¶

Tourism POI Clusters¶

Describe the attribute we are analyzing

In [35]:
# Analysis Parameters
# Attribute: number of tourism POIs per neighborhood (no missing values).
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "count_pois", "count_pois_lag"
# Derived column names for standardized values and spatial lags.
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))

# k chosen from the sensitivity sweep printed by set_k_value below.
k_value = 10
set_k_value(gdf_copy, att_col, 3, 12)

# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.120, p = 0.004
k= 4 -> Moran's I = 0.150, p = 0.001
k= 5 -> Moran's I = 0.144, p = 0.001
k= 6 -> Moran's I = 0.138, p = 0.001
k= 7 -> Moran's I = 0.151, p = 0.001
k= 8 -> Moran's I = 0.139, p = 0.001
k= 9 -> Moran's I = 0.147, p = 0.001
k=10 -> Moran's I = 0.155, p = 0.001
k=11 -> Moran's I = 0.147, p = 0.001
k=12 -> Moran's I = 0.144, p = 0.001

We used a K-nearest neighbors spatial weights matrix with k = 10. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 10 and declining gradually for larger k.

Tourism POI count per neighborhood - Spatial Lag¶

Describe spatial lag

In [36]:
# Spatial Lag
# Neighbor-averaged POI counts, mapped next to the raw counts.
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Tourism POIs per neighborhood")
No description has been provided for this image

Observe what we see in the plot

Tourism POI count per neighborhood - Standardized Data¶

Explain why we standardize (normalize) and how

In [37]:
# Standardize
# Mean-center the attribute and draw the Moran scatterplot (wider figure).
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Tourism POIs per neighborhood", figsizew=6, figsizeh=3, pointcolor="purple", linecolor="cyan", label_color="blue")
No description has been provided for this image

Observe what we see in the plot

Tourism POI count per neighborhood - Global Moran's I¶

Describe what global autocorrelation is and what Moran's I is computing

In [38]:
# Global Moran's I
# Use att_col ("count_pois" in this section) instead of repeating the literal,
# keeping the cell consistent with the section's parameterized pattern.
moran = run_global_MoranI(gdf, weight_matrix, att=att_col)
plot_global_MoranI(moran)
Moran's I: 0.15470330400512608
p-value: 0.001
No description has been provided for this image

Observe what we see in the plot

Tourism POI count per neighborhood - Local Indicators of Spatial Association (LISA)¶

In [39]:
# LISA
# att_col ("count_pois" here) replaces the hard-coded literal for consistency
# with the section parameters defined above.
lisa, cluster_df = run_LISA(gdf, weight_matrix, att=att_col)
count_pois_clusterdf = cluster_df.copy()  # snapshot reused by the concluding heatmaps
plot_LISA(lisa, cluster_df, weight_matrix, sigcolor="purple")
No description has been provided for this image

Observe what we see in the plot

In [40]:
# Map the LISA results for tourism POIs (color arguments style the quadrant and
# cluster layers — see map_LISA's definition for their exact roles).
map_LISA(lisa, cluster_df, weight_matrix, stats_cmap="cool", sig_cmap="PuOr", quad_c1="#BF40BF", quad_c2="#bfbcfc", quad_c3="#010057", quad_c4="#c1d8f0", clust_c1="#BE00FE", clust_c2="#bfbcfc", clust_c3="#010057", clust_c4="#c1d8f0")
No description has been provided for this image

Observe what we see in the plot


Museum Clusters¶

Describe the attribute we are analyzing

In [41]:
# Analysis Parameters
gdf_copy = neighborhood_stats.copy()
# Attribute analyzed in this section (museum count) and its spatial-lag column.
att_col, att_lag_col = "count_museums", "count_museums_lag"
# The redundant str() wrappers were dropped — these names are already strings.
att_col_std, att_lag_col_std = att_col + "_std", att_lag_col + "_std"
# NOTE(review): w_col / std_w_col do not appear to be used in this section — confirm before removing.
w_col, std_w_col = "w_" + att_col, "w_" + att_col_std

# k chosen from the k=3..12 sensitivity sweep printed by set_k_value below.
k_value = 7
set_k_value(gdf_copy, att_col, 3, 12)

# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.173, p = 0.003
k= 4 -> Moran's I = 0.238, p = 0.001
k= 5 -> Moran's I = 0.248, p = 0.001
k= 6 -> Moran's I = 0.257, p = 0.001
k= 7 -> Moran's I = 0.281, p = 0.001
k= 8 -> Moran's I = 0.271, p = 0.001
k= 9 -> Moran's I = 0.267, p = 0.001
k=10 -> Moran's I = 0.270, p = 0.001
k=11 -> Moran's I = 0.261, p = 0.001
k=12 -> Moran's I = 0.257, p = 0.001

We used a K-nearest neighbors spatial weights matrix with k = 7. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 7 and declining gradually for larger k.

Museum count per neighborhood - Spatial Lag¶

Describe spatial lag

In [42]:
# Spatial Lag
# Weighted neighbor average of museum counts (see spatial_lag), then
# attribute-vs-lag plot.
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Museum Count per neighborhood")
No description has been provided for this image

Observe what we see in the plot

Museum count per neighborhood - Standardized Data¶

Explain why we standardize (normalize) and how

In [43]:
# Standardize
# Standardize attribute and lag (scheme defined in standardize_values) for a
# centered Moran scatterplot.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Museum Count per neighborhood", figsizeh=3, figsizew=6, pointcolor="purple", marker="^", linecolor="cyan", label_color="blue")
No description has been provided for this image

Observe what we see in the plot

Museum count per neighborhood - Global Moran's I¶

Describe what global autocorrelation is and what Moran's I is computing

In [44]:
# Global Moran's I
# att_col ("count_museums" here) replaces the hard-coded literal for
# consistency with the section parameters.
moran = run_global_MoranI(gdf, weight_matrix, att=att_col)
plot_global_MoranI(moran)
Moran's I: 0.2814197362236021
p-value: 0.001
No description has been provided for this image

Observe what we see in the plot

Museum count per neighborhood - Local Indicators of Spatial Association (LISA)¶

In [45]:
# LISA
# att_col ("count_museums" here) replaces the hard-coded literal.
lisa, cluster_df = run_LISA(gdf, weight_matrix, att=att_col)
count_museums_clusterdf = cluster_df.copy()  # snapshot reused by the concluding heatmaps
plot_LISA(lisa, cluster_df, weight_matrix, sigcolor="purple")
No description has been provided for this image

Observe what we see in the plot

In [46]:
# Map the LISA results for museums (same styling as the POI section).
map_LISA(lisa, cluster_df, weight_matrix, stats_cmap="cool", sig_cmap="PuOr", quad_c1="#BF40BF", quad_c2="#bfbcfc", quad_c3="#010057", quad_c4="#c1d8f0", clust_c1="#BE00FE", clust_c2="#bfbcfc", clust_c3="#010057", clust_c4="#c1d8f0")
No description has been provided for this image

Observe what we see in the plot


Art Gallery Clusters¶

Describe the attribute we are analyzing

In [47]:
# Analysis Parameters
gdf_copy = neighborhood_stats.copy()
# Attribute analyzed in this section (art gallery count) and its spatial-lag column.
att_col, att_lag_col = "count_galleries", "count_galleries_lag"
# The redundant str() wrappers were dropped — these names are already strings.
att_col_std, att_lag_col_std = att_col + "_std", att_lag_col + "_std"
# NOTE(review): w_col / std_w_col do not appear to be used in this section — confirm before removing.
w_col, std_w_col = "w_" + att_col, "w_" + att_col_std

# k chosen from the k=3..12 sensitivity sweep printed by set_k_value below.
k_value = 3
set_k_value(gdf_copy, att_col, 3, 12)

# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.443, p = 0.001
k= 4 -> Moran's I = 0.373, p = 0.001
k= 5 -> Moran's I = 0.352, p = 0.001
k= 6 -> Moran's I = 0.348, p = 0.001
k= 7 -> Moran's I = 0.335, p = 0.001
k= 8 -> Moran's I = 0.302, p = 0.001
k= 9 -> Moran's I = 0.274, p = 0.001
k=10 -> Moran's I = 0.266, p = 0.001
k=11 -> Moran's I = 0.252, p = 0.001
k=12 -> Moran's I = 0.247, p = 0.001

We used a K-nearest neighbors spatial weights matrix with k = 3. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I highest at k = 3 and declining gradually for larger k.

Art Gallery count per neighborhood - Spatial Lag¶

Describe spatial lag

In [48]:
# Spatial Lag
# Weighted neighbor average of gallery counts (see spatial_lag), then
# attribute-vs-lag plot.
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Art Galleries per neighborhood")
No description has been provided for this image

Observe what we see in the plot

Art Gallery count per neighborhood - Standardized Spatial Lag¶

Explain why we standardize (normalize) and how

In [49]:
# Standardize
# Standardize attribute and lag (scheme defined in standardize_values) for a
# centered Moran scatterplot.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Art Galleries per neighborhood", figsizeh=3, figsizew=6, pointcolor="purple", marker="D", linecolor="cyan", label_color="blue")
No description has been provided for this image

Observe what we see in the plot

Art Gallery count per neighborhood - Global Moran's I¶

Describe what global autocorrelation is and what Moran's I is computing

In [50]:
# Global Moran's I
# att_col ("count_galleries" here) replaces the hard-coded literal for
# consistency with the section parameters.
moran = run_global_MoranI(gdf, weight_matrix, att=att_col)
plot_global_MoranI(moran)
Moran's I: 0.4434317494717987
p-value: 0.001
No description has been provided for this image

Observe what we see in the plot

Art Gallery count per neighborhood - Local Indicators of Spatial Association (LISA)¶

In [51]:
# LISA
# att_col ("count_galleries" here) replaces the hard-coded literal.
lisa, cluster_df = run_LISA(gdf, weight_matrix, att=att_col)
count_galleries_clusterdf = cluster_df.copy()  # snapshot reused by the concluding heatmaps
plot_LISA(lisa, cluster_df, weight_matrix, sigcolor="purple")
No description has been provided for this image

Observe what we see in the plot

In [52]:
# Map the LISA results for art galleries (same styling as the POI section).
map_LISA(lisa, cluster_df, weight_matrix, stats_cmap="cool", sig_cmap="PuOr", quad_c1="#BF40BF", quad_c2="#bfbcfc", quad_c3="#010057", quad_c4="#c1d8f0", clust_c1="#BE00FE", clust_c2="#bfbcfc", clust_c3="#010057", clust_c4="#c1d8f0")
No description has been provided for this image

Observe what we see in the plot


Monuments Clusters¶

Describe the attribute we are analyzing

In [53]:
# Analysis Parameters
gdf_copy = neighborhood_stats.copy()
# Attribute analyzed in this section (monument count) and its spatial-lag column.
# Fixed typo: "count_monumentss_lag" -> "count_monuments_lag" (double "s"),
# matching the "<att>_lag" naming used in every other section.
att_col, att_lag_col = "count_monuments", "count_monuments_lag"
# The redundant str() wrappers were dropped — these names are already strings.
att_col_std, att_lag_col_std = att_col + "_std", att_lag_col + "_std"
# NOTE(review): w_col / std_w_col do not appear to be used in this section — confirm before removing.
w_col, std_w_col = "w_" + att_col, "w_" + att_col_std

# k chosen from the k=3..12 sensitivity sweep printed by set_k_value below.
k_value = 5
set_k_value(gdf_copy, att_col, 3, 12)

# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.630, p = 0.001
k= 4 -> Moran's I = 0.631, p = 0.001
k= 5 -> Moran's I = 0.652, p = 0.001
k= 6 -> Moran's I = 0.631, p = 0.001
k= 7 -> Moran's I = 0.647, p = 0.001
k= 8 -> Moran's I = 0.621, p = 0.001
k= 9 -> Moran's I = 0.612, p = 0.001
k=10 -> Moran's I = 0.610, p = 0.001
k=11 -> Moran's I = 0.593, p = 0.001
k=12 -> Moran's I = 0.592, p = 0.001

We used a K-nearest neighbors spatial weights matrix with k = 5. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 5 and declining gradually for larger k.

Monument count per neighborhood - Spatial Lag¶

Describe spatial lag

In [54]:
# Spatial Lag
# Weighted neighbor average of monument counts (see spatial_lag), then
# attribute-vs-lag plot.
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Historic Monuments per Neighborhood", colormap="YlOrBr")
No description has been provided for this image

Observe what we see in the plot

Monument count per neighborhood - Standardized Data¶

Explain why we standardize (normalize) and how

In [55]:
# Standardize
# Standardize attribute and lag (scheme defined in standardize_values) for a
# centered Moran scatterplot.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Historic Monuments per Neighborhood", figsizeh=3, figsizew=5, pointcolor="orange", marker="p", linecolor="green", label_color="brown")
No description has been provided for this image

Observe what we see in the plot

Monument count per neighborhood - Global Moran's I¶

Describe what global autocorrelation is and what Moran's I is computing

In [56]:
# Global Moran's I
# att_col ("count_monuments" here) replaces the hard-coded literal for
# consistency with the section parameters.
moran = run_global_MoranI(gdf, weight_matrix, att=att_col)
plot_global_MoranI(moran)
Moran's I: 0.6518726591100805
p-value: 0.001
No description has been provided for this image

Observe what we see in the plot

Monument count per neighborhood - Local Indicators of Spatial Association (LISA)¶

In [57]:
# LISA
# att_col ("count_monuments" here) replaces the hard-coded literal.
lisa, cluster_df = run_LISA(gdf, weight_matrix, att=att_col)
count_monuments_clusterdf = cluster_df.copy()  # snapshot reused by the concluding heatmaps
plot_LISA(lisa, cluster_df, weight_matrix, sigcolor="orange")
No description has been provided for this image

Observe what we see in the plot

In [58]:
# Map the LISA results for monuments (orange/blue styling distinct from the
# purple scheme used in the other sections).
map_LISA(lisa, cluster_df, weight_matrix, stats_cmap="viridis", sig_cmap="Paired", quad_c1="#ff4d01", quad_c2="#F9AD6f", quad_c3="#010057", quad_c4="#c1d8f0", clust_c1="#ff4d01", clust_c2="#F9AD6f", clust_c3="#010057", clust_c4="#c1d8f0")
No description has been provided for this image

Observe what we see in the plot


Concluding Analysis¶

Heatmaps¶

In [83]:
# Add cluster designation to each neighborhood polygon from the different datasets.
# The six copy-pasted merge blocks are replaced by a single data-driven loop:
# each entry is (cluster DataFrame from a LISA section, suffix for the merged column).
cluster_heatmap_gdf = neighborhood_stats.copy()
cluster_sources = [
    (airbnb_prices_entirespace_clusterdf, "airbnbprices"),
    (count_airbnbs_clusterdf, "airbnbs"),
    (count_pois_clusterdf, "pois"),
    (count_museums_clusterdf, "museums"),
    (count_galleries_clusterdf, "galleries"),
    (count_monuments_clusterdf, "monuments"),
]
for source_df, suffix in cluster_sources:
    # Left-join on the CBS neighborhood code so polygons without a cluster get NaN.
    renamed = source_df[["CBS_Buurtcode", "cluster"]].rename(columns={"cluster": "cluster_" + suffix})
    cluster_heatmap_gdf = cluster_heatmap_gdf.merge(renamed, on="CBS_Buurtcode", how="left")

# define function to map hotspots and coldspots with contextily
def create_heatmap(title, attribute, hotspotcolor="#d7191c", coldspotcolor="#2b83ba", transparency=0.6, gdf=None):
    """Plot High-High (hotspot) and Low-Low (coldspot) LISA clusters over a basemap.

    Parameters
    ----------
    title : str
        Human-readable attribute name used in legend entries and the figure title.
    attribute : str
        Column of `gdf` holding the LISA cluster labels.
    hotspotcolor, coldspotcolor : str
        Fill colors for High-High and Low-Low polygons.
    transparency : float
        Alpha applied to the cluster polygons.
    gdf : GeoDataFrame, optional
        Source polygons. Defaults to the notebook-level cluster_heatmap_gdf,
        preserving the original behavior while removing the hidden global
        dependency for other callers.
    """
    if gdf is None:
        gdf = cluster_heatmap_gdf
    hh_ll = gdf[gdf[attribute].isin(["High-High", "Low-Low"])].copy()
    # Reproject to Web Mercator so the contextily basemap tiles align.
    hh_ll = hh_ll.to_crs(epsg=3857)

    fig, ax = plt.subplots(figsize=(8, 8))
    hh_ll[hh_ll[attribute] == "High-High"].plot(ax=ax, color=hotspotcolor, edgecolor="white", linewidth=0.3, alpha=transparency)
    hh_ll[hh_ll[attribute] == "Low-Low"].plot(ax=ax, color=coldspotcolor, edgecolor="white", linewidth=0.3, alpha=transparency)

    contextily.add_basemap(ax, crs=hh_ll.crs, source=contextily.providers.CartoDB.Positron)
    handles = [mpatches.Patch(color=hotspotcolor, label="High " + str(title) + " clusters"), mpatches.Patch(color=coldspotcolor, label="Low " + str(title) + " clusters")]
    ax.legend(handles=handles, loc="lower left", fontsize=8, title="LISA clusters")

    ax.set_axis_off()
    plt.title("Hotspots and Coldspots of " + str(title) + " clusters over Amsterdam")
    plt.tight_layout()
    plt.show()

Airbnb Price hotspots and coldspots¶

In [84]:
# Hotspot/coldspot map of the Airbnb price LISA clusters
create_heatmap("Airbnb Price", "cluster_airbnbprices")
No description has been provided for this image

Airbnb Location hotspots and coldspots¶

In [85]:
# Hotspot/coldspot map of the Airbnb location-count LISA clusters
create_heatmap("Airbnb Locations", "cluster_airbnbs")
No description has been provided for this image

Tourism POI hotspots and coldspots¶

In [86]:
# Hotspot/coldspot map of the tourism-POI LISA clusters
create_heatmap("Tourism POIs", "cluster_pois")
No description has been provided for this image

Museum hotspots and coldspots¶

In [87]:
# Hotspot/coldspot map of the museum LISA clusters
create_heatmap("Museum", "cluster_museums")
No description has been provided for this image

Art Gallery hotspots and coldspots¶

In [88]:
# Hotspot/coldspot map of the art-gallery LISA clusters
create_heatmap("Art Gallery", "cluster_galleries")
No description has been provided for this image

Monuments hotspots and coldspots¶

In [89]:
# Hotspot/coldspot map of the monument LISA clusters
create_heatmap("Monument", "cluster_monuments")
No description has been provided for this image

Intersections of Spatial Correlation¶

Airbnb price cluster intersection with any tourism-related cluster¶

In [90]:
# Identify Airbnb clusters that match any of the tourism clusters
tourism_cols = ["cluster_pois","cluster_museums","cluster_galleries","cluster_monuments"]
# Row-wise: does the Airbnb price cluster label equal ANY tourism cluster label?
# NOTE(review): "Not significant" == "Not significant" also counts as a match
# here; those rows are filtered out in the follow-up table cell.
match_any = (cluster_heatmap_gdf[tourism_cols].eq(cluster_heatmap_gdf["cluster_airbnbprices"], axis=0).any(axis=1))
# Keep the Airbnb cluster label where a match exists, NaN otherwise.
cluster_heatmap_gdf["cluster_airbnb_match_any_tourism"] = np.where(match_any,cluster_heatmap_gdf["cluster_airbnbprices"],np.nan)

match_gdf = cluster_heatmap_gdf.dropna(subset=["cluster_airbnb_match_any_tourism"]).copy()
# Web Mercator so the contextily basemap tiles align with the polygons.
match_gdf = match_gdf.to_crs(epsg=3857)
color_map = {"High-High": "#d7191c","Low-Low":   "#2b83ba","High-Low":  "#fdae61","Low-High":  "#c1d8f0"}

fig, ax = plt.subplots(figsize=(8, 8))
handles = []
# Plot each cluster class in its own color, building legend entries as we go.
for label, color in color_map.items():
    sel = match_gdf["cluster_airbnb_match_any_tourism"] == label
    if sel.any():
        match_gdf.loc[sel].plot(ax=ax,color=color,edgecolor="white",linewidth=0.3,alpha=0.6)
        handles.append(mpatches.Patch(color=color, label=label))

contextily.add_basemap(ax,crs=match_gdf.crs,source=contextily.providers.CartoDB.Voyager)

if handles:
    ax.legend(handles=handles,title="Airbnb cluster matching\na tourism cluster",loc="lower right",fontsize=8)

ax.set_axis_off()
plt.title("Where Airbnb LISA clusters match\nat least one tourism LISA cluster", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [91]:
# show details of matching airbnb and tourism clusters
# Drop rows whose Airbnb price cluster is itself "Not significant" (those only
# matched a tourism column that was also "Not significant").
show_match_gdf = match_gdf.loc[match_gdf['cluster_airbnbprices'] != "Not significant"]
# Table of matched neighborhoods, most expensive Airbnb markets first.
show_match_gdf.loc[:, ['Buurtcode', 'Buurt', 'count_airbnbs', 'avg_price_airbnb_entirespace', 'count_pois', 'count_museums', 'count_galleries', 'count_monuments', 'cluster_airbnbprices', 'cluster_pois', 'cluster_museums', 'cluster_galleries', 'cluster_monuments']].sort_values(by='avg_price_airbnb_entirespace', ascending=False)
Out[91]:
Buurtcode Buurt count_airbnbs avg_price_airbnb_entirespace count_pois count_museums count_galleries count_monuments cluster_airbnbprices cluster_pois cluster_museums cluster_galleries cluster_monuments
164 AE04 Nes e.o. 15 431.750000 3 1 0 80 High-High High-High High-High Not significant High-High
5 AD03 Nieuwendijk-Noord 25 380.200000 10 1 1 99 High-High High-High High-High Not significant High-High
9 AD06 Spuistraat-Zuid 25 375.000000 6 1 1 145 High-High High-High High-High Not significant High-High
160 AE01 Kop Zeedijk 21 373.333333 8 2 0 187 High-High High-High High-High Low-High High-High
10 AD07 Begijnhofbuurt 12 352.666667 14 1 0 160 High-High High-High High-High Not significant High-High
163 AE03 Burgwallen-Oost 54 343.363636 7 4 2 285 High-High High-High High-High Not significant High-High
446 AB11 Passeerdersgrachtbuurt 22 340.857143 0 0 0 53 High-High Not significant Low-High Not significant High-High
414 KN01 Prinses Irenebuurt 11 337.000000 5 0 0 5 High-High High-High Not significant Not significant Not significant
161 AE02 Oude Kerk e.o. 34 332.500000 8 1 1 139 High-High High-High High-High High-High High-High
399 KJ04 Minervabuurt-Zuid 7 329.500000 3 0 0 1 High-High High-High Not significant Not significant Not significant
7 AD05 Nieuwe Kerk e.o. 35 305.923077 17 2 1 133 High-High High-High High-High High-High High-High
1 AC03 Felix Meritisbuurt 63 302.411765 3 0 1 460 High-High High-High Low-High High-High High-High
173 AF08 Zuiderkerkbuurt 34 302.100000 3 0 0 136 High-High High-High Low-High Not significant High-High
159 EV02 Vondelparkbuurt-Oost 36 295.846154 8 0 0 20 High-High High-High Low-High Not significant Not significant
2 AC04 Leidsegracht-Noord 16 295.200000 4 3 0 130 High-High Not significant Not significant Not significant High-High
167 AF02 Lastage 31 287.692308 0 0 0 122 High-High Not significant Low-High Not significant High-High
185 AH06 Leidsebuurt-Zuidoost 8 278.250000 8 2 0 7 High-High High-High Not significant Not significant Low-High
171 AF06 Uilenburg 17 278.083333 2 0 0 32 High-High Low-High Not significant Not significant High-High
162 AH02 Leidsebuurt-Noordoost 46 269.727273 5 1 2 109 High-High Not significant High-High Not significant High-High
266 NQ10 Durgerdam 8 266.500000 3 0 0 69 High-Low High-Low Not significant Not significant Not significant
8 AH01 Leidsebuurt-Noordwest 14 265.125000 1 0 0 29 High-High Not significant Low-High Not significant High-High
165 AE05 BG-terrein e.o. 28 263.142857 16 2 0 150 High-High High-High High-High Not significant High-High
213 NC02 Tuindorp Oostzaan-Oost 42 223.705882 1 0 0 8 Low-Low Low-Low Not significant Not significant Not significant
224 NF01 Buiksloterdijk-West 2 205.000000 0 0 0 10 Low-Low Low-Low Not significant Not significant Not significant
327 FN04 Staalmanbuurt 16 201.666667 1 0 0 0 Low-Low Low-Low Not significant Not significant Not significant
212 NC01 Tuindorp Oostzaan-West 6 198.000000 0 0 0 0 Low-Low Low-Low Not significant Not significant Not significant
221 NE04 Banne-Zuidoost 23 192.818182 0 0 0 3 Low-Low Low-Low Not significant Not significant Not significant
223 NE06 Marjoleinterrein 2 180.000000 1 0 0 0 Low-Low Low-Low Not significant Not significant Not significant
244 NK01 Bloemenbuurt-Noord 44 178.444444 1 0 0 0 Low-Low Low-Low Not significant Not significant Not significant
109 MJ03 Rieteilanden-West 27 172.285714 1 0 0 0 Low-High Low-Low Not significant Low-High Not significant
218 NE01 Banne-Noordwest 7 142.666667 0 0 0 0 Low-Low Low-Low Not significant Not significant Not significant
In [ ]:
 

Airbnb price cluster intersection with every tourism-related cluster¶

In [92]:
# Identify neighborhoods whose five LISA cluster labels all agree
cols = ["cluster_airbnbprices","cluster_pois","cluster_museums","cluster_galleries","cluster_monuments"]
all_HH = (cluster_heatmap_gdf[cols] == "High-High").all(axis=1)
all_LL = (cluster_heatmap_gdf[cols] == "Low-Low").all(axis=1)
all_HL = (cluster_heatmap_gdf[cols] == "High-Low").all(axis=1)
all_LH = (cluster_heatmap_gdf[cols] == "Low-High").all(axis=1)
cluster_heatmap_gdf["cluster_all"] = np.nan
cluster_heatmap_gdf.loc[all_HH, "cluster_all"] = "High-High"
cluster_heatmap_gdf.loc[all_LL, "cluster_all"] = "Low-Low"
cluster_heatmap_gdf.loc[all_HL, "cluster_all"] = "High-Low"
cluster_heatmap_gdf.loc[all_LH, "cluster_all"] = "Low-High"
# display() so the class counts actually render — a bare mid-cell expression
# is silently discarded by the notebook.
display(cluster_heatmap_gdf["cluster_all"].value_counts())

all5 = cluster_heatmap_gdf.dropna(subset=["cluster_all"]).copy()
all5 = all5.to_crs(epsg=3857)  # Web Mercator for basemap alignment
color_map = {"High-High": "#d7191c","Low-Low":"#2b83ba","High-Low":"#fdae61","Low-High":"#abdda4"}

fig, ax = plt.subplots(figsize=(12, 6))
handles = []
# Plot each agreeing cluster class in its own color.
for label, color in color_map.items():
    sel = all5["cluster_all"] == label
    if sel.any():
        all5.loc[sel].plot(ax=ax, color=color, edgecolor="white", linewidth=0.3, alpha=0.6)
        handles.append(mpatches.Patch(color=color, label=label))

# (A dead commented-out axis-padding block was removed here.)

contextily.add_basemap(ax, crs=all5.crs, source=contextily.providers.OpenStreetMap.Mapnik)
if handles:
    ax.legend(handles=handles, title="Highest Airbnb Prices and Tourism Draw", loc="lower left", fontsize=8)

ax.set_axis_off()
plt.title("Neighborhoods with the same LISA cluster classification for all Tourism and Airbnb Clusters", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [93]:
# show details of most sought after neighborhoods
# Full attribute profile of the neighborhoods whose five LISA labels all agree.
all5.loc[:, ['Buurtcode', 'Buurt', 'count_airbnbs', 'avg_price_airbnb_entirespace', 'count_pois', 'count_museums', 'count_galleries', 'count_monuments', 'Oppervlakte_m2']]
Out[93]:
Buurtcode Buurt count_airbnbs avg_price_airbnb_entirespace count_pois count_museums count_galleries count_monuments Oppervlakte_m2
7 AD05 Nieuwe Kerk e.o. 35 305.923077 17 2 1 133 83841
161 AE02 Oude Kerk e.o. 34 332.500000 8 1 1 139 92282

As these are the only two neighborhoods classified as High-High LISA clusters simultaneously for the airbnb price clusters, tourism POI clusters, museum clusters, art gallery clusters, and monument clusters; we can estimate that the Nieuwe Kerk e.o. and Oude Kerk e.o. neighborhoods are the most sought-after neighborhoods for tourism in Amsterdam.

Plotting Airbnb Price vs Tourism¶

In [94]:
# plot airbnb price with tourism count per neighborhood
scatter_df = cluster_heatmap_gdf.copy()
# NOTE(review): "tourism_total" sums POIs and monuments only — museums and
# galleries are excluded (possibly already counted within count_pois); confirm.
scatter_df["tourism_total"] = scatter_df["count_pois"] + scatter_df["count_monuments"]
scatter_df = scatter_df.dropna(subset=["avg_price_airbnb_entirespace", "tourism_total", "cluster_airbnbprices"])
# Keep only neighborhoods with a significant LISA class for Airbnb prices.
valid_clusters = ["High-High", "Low-Low", "High-Low", "Low-High"]
scatter_df = scatter_df[scatter_df["cluster_airbnbprices"].isin(valid_clusters)]

cluster_palette = {"High-High": "#d7191c","Low-Low":   "#2b83ba","High-Low":  "#fdae61","Low-High":  "#c1d8f0"}
plt.figure(figsize=(8, 6))
# Scatter colored by LISA class, overlaid with a linear fit and 95% CI band.
sns.scatterplot(data=scatter_df,x="tourism_total",y="avg_price_airbnb_entirespace",hue="cluster_airbnbprices",palette=cluster_palette,alpha=0.8,edgecolor="white",linewidth=0.5)
sns.regplot(data=scatter_df,x="tourism_total",y="avg_price_airbnb_entirespace",scatter=False,ci=95,color="green")

plt.xlabel("Number of tourism sites")
plt.ylabel("Average Airbnb price")
plt.title("Avg Airbnb Price vs. Tourism Sites per Neighborhood")
plt.ylim(bottom=100)
plt.tight_layout()
plt.show()
No description has been provided for this image

Conclusion¶

The prices of airbnbs are clustered around three areas. First, airbnb prices go up with proximity to the historically- and culturally-rich Center-West canals. Second, the clustering of higher airbnb prices certainly occurs around the city's two major parks, Vondelpark and Oosterpark; however, this is not true for parks at the edge of the city limits like Rembrandtpark and the Amsterdamse Bos, which actually have a cluster of lower airbnb prices. Third, there is a clustering of higher airbnb prices near the major bar & clubbing area of Leidseplein.

We notice that the higher prices of airbnbs are more spatially clustered with tourism sites, whereas the LISA autocorrelation performed on the clustering of airbnb locations at the beginning of this analysis showed that airbnb locations were concentrated on the South/Southwest periphery of the tourism center. The commonality between the autocorrelation of airbnb prices and the autocorrelation of airbnb locations is their shared clustering around Vondelpark.

In [ ]: